# 加载class包
# if (!require(class)) install.packages("class")
library(class)
library(caret)
library(pROC)
library(MatchIt)
library(dplyr)
library(ggplot2)
library(yardstick)
library(ModelMetrics)
library(caretEnsemble)
library(plotly)

# Switch to the data directory and read the modelling data set.
# NOTE(review): "${path}" looks like a template placeholder that is filled in
# before the script runs - confirm.
setwd("${path}")
mydata <- read.csv(file = "bibliometric_nuomotu.csv")

# Columns 3 to 38 of the input are the predictors used by the rest of the script.
cols <- names(mydata)[3:38]
# Predictor names joined with "+", ready to be used as the right-hand side of a formula.
independent_and <- paste(cols, collapse = "+")
# Predictor names joined with the three characters quote-comma-quote.
independent_dou <- paste(cols, collapse = "\",\"")
# group_best is the outcome chosen by the user, coded 0 / 1 in the raw data
# (e.g. 0, 1, 1, 0). Recode it as a factor labelled "event_0" / "event_1".
mydata$group_best <- factor(
  x = mydata$group_best,
  levels = c("0", "1"),
  labels = c("event_0", "event_1")
)


# Nearest-neighbour matching of group_best on all predictors, requesting a
# 1:3 ratio.
m.out <- matchit(
  formula = as.formula(paste("group_best", independent_and, sep = "~")),
  data = mydata,
  method = "nearest",
  ratio = 3
)

# Extract the matched observations and drop every row that still contains a
# missing value.
matched_data <- m.out %>%
  match.data() %>%
  na.omit()

# Numeric 0/1 copy of the outcome (1 means "event_1").
matched_data$group_best_num <- as.numeric(matched_data$group_best == "event_1")
# Fix the RNG seed before fitting.
set.seed(123)
# The train/test split is currently disabled, so the model below is scored on
# the very rows it was fitted on.
#inTrain <- createDataPartition(matched_data$group_best, p = 0.8, list = FALSE)
#train_set <- matched_data[inTrain, ]
#test_set <- matched_data[-inTrain, ]

# 5-nearest-neighbour classification; prob = TRUE additionally returns the
# share of votes won by the predicted class as the "prob" attribute.
predictions_knn <- knn(
  train = matched_data[, cols],
  test = matched_data[, cols],
  cl = matched_data$group_best,
  k = 5,
  prob = TRUE
)

# Turn the winning-class vote share into the probability of "event_1":
# keep it when "event_1" was predicted, otherwise take the complement.
attr_prob <- attr(predictions_knn, "prob", exact = TRUE)
predicted_classes_knn <- predictions_knn
predicted_probs_knn <- ifelse(
  predicted_classes_knn == "event_1",
  attr_prob,
  1 - attr_prob
)



# Attach the predicted probability and predicted class to the matched data.
mydata_with_predictions_knn <- mutate(
  matched_data,
  group_best_probability = predicted_probs_knn,
  group_best_predicted_class = predicted_classes_knn,
  # KNN has no scorecard, so the score column is only a placeholder.
  group_best_score = NA
)

# Count the rows whose predicted class equals the observed class.
# na.rm = TRUE mirrors dplyr::filter(), which drops rows where the comparison
# is NA.
matching_rows <- sum(
  mydata_with_predictions_knn$group_best ==
    mydata_with_predictions_knn$group_best_predicted_class,
  na.rm = TRUE
)
cat("Number of rows where group_best and group_best_predicted_class match:", matching_rows, "\n")

# Accuracy = matching rows / total rows. The label previously repeated the
# "Number of rows ..." text although the value printed is a proportion.
total_rows <- nrow(matched_data)
accuracy <- matching_rows / total_rows
cat("Accuracy (matching rows / total rows):", accuracy, "\n")

# Results for the modelling (matched) data set.
write.csv(mydata_with_predictions_knn, file = "mydata_with_predictions.csv", row.names = FALSE)

try({
  
  try({
    #### Confusion matrix ####
    # caret and ModelMetrics both export confusionMatrix(); ModelMetrics is
    # attached after caret and therefore masks it, so the caret version is
    # requested explicitly. With two factor levels caret treats the first
    # level ("event_0") as the positive class unless `positive` is supplied.
    conf_matrix <- caret::confusionMatrix(
      data = as.factor(predicted_classes_knn),
      reference = as.factor(mydata_with_predictions_knn$group_best)
    )
    # capture.output() opens and closes the file itself, so output is not left
    # diverted to the file if printing fails (unlike a bare sink()/sink() pair).
    utils::capture.output(print(conf_matrix), file = "confusionMatrix_knn.txt")
  })
  try({
    #### ROC curve ####
    # pROC and ModelMetrics both export auc(); ModelMetrics is attached later
    # and masks pROC's version, so the pROC functions are called explicitly.
    # direction = "<" states that controls (0) are expected to have lower
    # predicted probabilities than cases (1) instead of letting roc()
    # auto-detect the direction from the data.
    roc_curve <- pROC::roc(
      response = mydata_with_predictions_knn$group_best_num,
      predictor = predicted_probs_knn,
      levels = c(0, 1),
      direction = "<"
    )
    auc_value <- pROC::auc(roc_curve)
    # as.numeric() drops the "auc" class before the value is formatted.
    auc_label <- paste0("AUC = ", formatC(as.numeric(auc_value), digits = 3, format = "f"))

    # Build the plot before opening the device so that a failure here does not
    # leave an empty PNG or a dangling graphics device behind.
    # The line colour is passed straight to ggroc(): no colour aesthetic is
    # mapped, so the former scale_color_manual()/scale_fill_manual() calls had
    # no effect.
    roc_plot <- pROC::ggroc(roc_curve, legacy.axes = TRUE, colour = "#1f78b4") +
      # Diagonal reference line (chance level)
      annotate("segment", x = 0, xend = 1, y = 0, yend = 1, linetype = "dashed", color = "grey60") +
      # AUC label
      annotate("text", x = 0.7, y = 0.3, label = auc_label, size = 5, fontface = "bold") +
      # Title and axis labels
      ggtitle("Receiver Operating Characteristic (ROC) Curve") +
      labs(x = "False Positive Rate", y = "True Positive Rate") +
      # Theme
      theme_minimal() +
      theme(
        plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
        axis.title = element_text(size = 12, face = "bold"),
        axis.text = element_text(size = 10),
        legend.position = "bottom",
        panel.grid.major = element_line(color = "grey90"),
        panel.grid.minor = element_blank(),
        plot.background = element_rect(fill = "white", color = NA)
      ) +
      # 1:1 aspect ratio, the conventional shape for a ROC plot
      coord_fixed(ratio = 1)

    # ggplot objects are only drawn when printed; inside try({ ... }) there is
    # no auto-printing, so without print() the PNG stays empty.
    png(filename = "roc_knn.png", width = 800, height = 800)
    tryCatch(print(roc_plot), finally = dev.off())
  })
  try({
    #### Calibration data ####
    # Pair every predicted probability of "event_1" with the observed 0/1 outcome.
    calibration_data <- data.frame(
      PredictedProb = predicted_probs_knn,
      TrueClass = matched_data$group_best_num
    )

    # Cut the observed range of predicted probabilities into 10 equal-width bins.
    num_bins <- 10
    calibration_data$bin <- cut(calibration_data$PredictedProb, breaks = num_bins, include.lowest = TRUE)

    # Mean predicted probability and observed event rate within each bin.
    calibration_summary <- calibration_data %>%
      group_by(bin) %>%
      summarise(
        MeanPredictedProb = mean(PredictedProb, na.rm = TRUE),
        ActualRate = mean(TrueClass, na.rm = TRUE),
        .groups = 'drop'
      )
  })

  try({
    #### Calibration plot ####
    # Build the plot before opening the device so that a failure here (for
    # example a missing calibration_summary) does not leave an empty PNG or a
    # dangling graphics device behind.
    calibration_plot <- ggplot(calibration_summary, aes(x = MeanPredictedProb, y = ActualRate)) +
      geom_point(size = 3, color = "#1f78b4") +  # one point per bin
      geom_line(color = "#1f78b4", size = 1) +   # connect the bins
      geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "grey60") +  # perfect calibration
      labs(title = "Calibration Plot",
           x = "Mean Predicted Probability",
           y = "Actual Rate") +
      theme_minimal() +
      theme(
        plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
        axis.title = element_text(size = 12, face = "bold"),
        axis.text = element_text(size = 10),
        panel.grid.major = element_line(color = "grey90"),
        panel.grid.minor = element_blank(),
        plot.background = element_rect(fill = "white", color = NA)
      )

    # ggplot objects are only drawn when printed; inside try({ ... }) there is
    # no auto-printing, so without print() the PNG stays empty.
    png(filename = "cal_knn.png", width = 800, height = 800)
    tryCatch(print(calibration_plot), finally = dev.off())
  })
  try({
    #### K selection plot ####
    # Hold out 20% of the matched data to compare different values of K.
    set.seed(123)
    inTrain <- createDataPartition(matched_data$group_best_num, p = 0.8, list = FALSE)
    train_set <- matched_data[inTrain, ]
    test_set <- matched_data[-inTrain, ]

    # Features used for the K comparison, defined once instead of being
    # repeated for the train and the test set.
    # NOTE(review): this is a fixed 12-column list, whereas the fitted model
    # uses `cols` - confirm that the difference is intended.
    k_features <- c(
      "auth_sum", "lan_sum", "web_site_sum", "jour_sum", "pt_sum", "country_sum",
      "fund_sum", "unit_sum", "sum", "total", "impact_factor", "fen_qu"
    )

    # Odd K values from 1 to 19.
    k_values <- seq(1, 20, by = 2)

    # Hold-out accuracy for every K; vapply() guarantees one number per K.
    accuracies <- vapply(k_values, function(k) {
      predictions_cv <- knn(
        train = train_set[, k_features],
        test = test_set[, k_features],
        cl = as.factor(train_set$group_best),  # class labels must be a factor
        k = k,
        prob = FALSE  # only the predicted class is needed here
      )

      mean(predictions_cv == as.character(test_set$group_best), na.rm = TRUE)
    }, numeric(1))

    k_plot_df <- data.frame(K = k_values, Accuracy = accuracies)

    # Accuracy against K: blue line with red, white-filled markers.
    knn_plot <- ggplot(k_plot_df, aes(x = K, y = Accuracy)) +
      geom_line(color = "#2c7bb6", size = 0.8) +
      geom_point(color = "#d7191c", size = 3, shape = 21, fill = "white") +
      labs(title = "Optimal K Selection for KNN Model",
           subtitle = "Accuracy of the model across different values of K",
           x = "K Value",
           y = "Accuracy") +
      theme_bw() +
      theme(
        plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
        plot.subtitle = element_text(size = 12, hjust = 0.5),
        axis.title = element_text(size = 14, face = "bold"),
        axis.text = element_text(size = 12),
        panel.grid.major = element_line(color = "grey85", size = 0.2),
        panel.grid.minor = element_blank(),
        plot.background = element_rect(fill = "white", color = NA),
        panel.border = element_rect(colour = "black", fill=NA, size=0.5)
      )

    # The plot has to be printed explicitly to be drawn on the PNG device.
    png(filename = "knn_knn.png", width = 800, height = 800)
    print(knn_plot)
    dev.off()
  })
})


# Score the outcome data set with the KNN model fitted on the matched data.
mydata1 <- read.csv("bibliometric_nuomotuRes.csv")

# prob = TRUE returns the share of votes won by the predicted class as the
# "prob" attribute, exactly as in the modelling section.
predictions_knn <- knn(
  train = matched_data[, cols],
  test = mydata1[, cols],
  cl = matched_data$group_best,
  k = 5,
  prob = TRUE
)

# Probability of "event_1": the vote share when "event_1" was predicted,
# otherwise its complement. This replaces the former fixed 0.9 / 0.1
# placeholder, which was computed and then never used.
result_attr_prob <- attr(predictions_knn, "prob", exact = TRUE)
predicted_probs_knn <- ifelse(
  predictions_knn == "event_1",
  result_attr_prob,
  1 - result_attr_prob
)
predicted_classes_knn <- predictions_knn

# Attach the predictions to the outcome data, using the same columns as the
# modelling-set output.
result_with_predictions_knn <- mydata1 %>%
  mutate(
    group_best_probability = predicted_probs_knn,
    group_best_predicted_class = predicted_classes_knn,
    group_best_score = NA  # KNN has no scorecard
  )

# Results for the outcome data set.
write.csv(result_with_predictions_knn, file = "result_with_predictions.csv", row.names = FALSE)